import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# For getting the colour map
import matplotlib.cm as cm
# For clustering and creating dendrogram
from scipy.cluster.hierarchy import dendrogram, linkage
# For labels of the dendrogram plot
import pylab
# for scatter matrix of numerical variables
from pandas.plotting import scatter_matrix
#Import for running a t test
from scipy.stats import ttest_ind
#Import stats models for Logistic Regression for stastical Inference
import statsmodels.formula.api as smf
#Import DateTime to manipulate datetime columns
import datetime
# Import Plotly for graphs
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
In this section we define the common functions that we use for analysis of different columns.
In this section we analyse the columns for null and single values and drop the columns
We also do pre processing of some columns like zipcode, etc.
This section is divided into the following subsections. We do the following for each column
percentiles, null values, outliers
Data Distribution - histogram, bar chart. box chart.
Univariate Analysis
Segmented univariate analysis
t-test (To conclude after the advanced statistics course)
Correlation analysis, Bivariate Analysis, Derived Variables.
Logistic Regression for statistical Inference
Most Affected Columns & Univariate Analysis
# Helper for numeric columns: percentile table, null count, histogram,
# box plot, and a box plot segmented by loan status.
def do_analysis(dataframe, column_name):
    """Summarize a numeric column of *dataframe*.

    Prints deciles (0%, 10%, ..., 100%) and the null count, then draws a
    histogram, an overall box plot, and a box plot split by loan_status.
    """
    series = dataframe[column_name]
    print(series.describe(percentiles=np.linspace(0, 1, 11)))
    null_count = series.isnull().sum()
    print("Number of null values for the column {0} : {1}".format(column_name, null_count))
    plt.figure(figsize=(10, 10))
    plt.subplot(311)
    series.plot(kind='hist', bins=50)
    plt.subplot(312)
    series.plot(kind='box')
    # Segmented view: one box per loan_status value (creates its own figure).
    dataframe[[column_name, 'loan_status']].boxplot(by='loan_status')
# Helper for categorical columns: value counts, a pivot by loan status with
# paid / charged-off percentages, and a stacked bar chart.
def do_analysis_categorical(dataframe, column_name):
    """Print value counts and a loan_status pivot for a categorical column,
    then draw a stacked bar chart of counts per loan_status.

    Assumes policy_code is a constant 1 so its sum equals the row count —
    TODO confirm against the source data.
    """
    print(dataframe[column_name].value_counts())
    # Segmented univariate analysis: counts per category x loan_status.
    pivot = pd.pivot_table(dataframe, values='policy_code',
                           index=[column_name], columns=['loan_status'],
                           aggfunc=np.sum, margins=True)
    paid_fraction = pivot['Fully Paid'] / pivot['All']
    pivot['Paid Percentage'] = paid_fraction
    pivot['Charged Off Percentage'] = (1 - paid_fraction) * 100
    pivot['Paid Percentage'] = paid_fraction * 100
    print(pivot)
    pd.pivot_table(dataframe, values='policy_code', index=[column_name],
                   columns=['loan_status'], aggfunc=np.sum).plot(
        kind='bar', stacked=True, figsize=(24, 8))
# Helper function to run a t test between the charged-off and fully-paid populations.
def run_t_test(dataframe, column_name):
    """Run an independent two-sample t-test on *column_name* between the
    'Charged Off' and 'Fully Paid' rows of *dataframe* and print the result.

    Fix: nan_policy='omit' drops null entries; previously a single NaN in
    either sample propagated and printed a NaN statistic/p-value.
    """
    charged_off = dataframe.loc[dataframe['loan_status'] == 'Charged Off']
    fully_paid = dataframe.loc[dataframe['loan_status'] == 'Fully Paid']
    print(ttest_ind(charged_off[column_name], fully_paid[column_name],
                    nan_policy='omit'))
# All input dates are in MON-YY format, e.g. DEC-11.
# This function converts such a date into an integer of the form YYYYMM.
def changeDate2SequenceOfInt(inputDate):
    """Convert a 'Mon-YY' date string into the integer YYYYMM.

    Null/NaN inputs map to 0 so the column stays numeric after apply().
    """
    if pd.isnull(inputDate):
        return 0
    parsed = datetime.datetime.strptime(inputDate, "%b-%y")
    # year*100 + month equals concatenating YYYY with the zero-padded MM.
    return parsed.year * 100 + parsed.month
# Report the percentage of null entries for a column of lending_club_master.
def checkNullPerForColumns(columnName):
    """Print the null percentage (rounded to 2 decimals) of *columnName*
    in the module-level lending_club_master dataframe."""
    column = lending_club_master.loc[:, columnName]
    null_percent = round(100 * (column.isnull().sum() / len(column.index)), 2)
    print ("Null Percentage of the Column is: ==> ", null_percent)
# Plot a total-loan_amnt bar chart and a count plot for a column, split by loan status.
def pltUniVariableAnalysis(column, df):
    """Two stacked panels for *column* of *df*: total loan_amnt per category
    (barplot, hue = loan_status) and record counts per category (countplot)."""
    plt.figure(figsize=(10, 10))
    y = 'loan_amnt'
    hue = 'loan_status'
    plt.subplot(2, 1, 1)
    amount_axes = sns.barplot(x=column, y=y, hue=hue, data=df, estimator=np.sum)
    amount_axes.set_xticklabels(amount_axes.get_xticklabels(), rotation=90)
    plt.subplot(2, 1, 2)
    count_axes = sns.countplot(x=column, hue=hue, data=df)
    count_axes.set_xticklabels(count_axes.get_xticklabels(), rotation=90)
    plt.show()
def createDict(columnName, dataFrame):
    """Build forward and reverse mappings between the unique values of
    *columnName* and consecutive integer codes (first-appearance order).

    Returns:
        tuple: (value -> code dict, code -> value dict)
    """
    unique_values = dataFrame.loc[:, columnName].unique()
    codes = np.arange(len(unique_values))
    value_to_code = dict(zip(unique_values, codes))
    code_to_value = dict(zip(codes, unique_values))
    return value_to_code, code_to_value
#Plot Categorical Based on Loan_Amount With Title as Parameter. Draw 2 Plots to Show the Comparison of Loan Status
def pltCategorical(columnName, title1, title2):
    """Plot total loan_amnt per category of *columnName* from the
    module-level lending_club: one bar panel for 'Charged Off' and one
    for 'Fully Paid', each sorted by amount descending."""
    X1 = lending_club.pivot_table(index=columnName, values='loan_amnt',
                                  columns='loan_status', aggfunc='sum').reset_index()
    plt.figure(figsize=(16, 14))
    plt.subplot(2, 1, 1)
    # Fix: pass x/y as keywords — positional data arguments were removed
    # from seaborn's barplot in 0.12+.
    p = sns.barplot(x=columnName, y='Charged Off',
                    data=X1.sort_values(by='Charged Off', ascending=False))
    p.set_xticklabels(p.get_xticklabels(), rotation=30)
    plt.title(title1)
    plt.subplot(2, 1, 2)
    q = sns.barplot(x=columnName, y='Fully Paid',
                    data=X1.sort_values(by='Fully Paid', ascending=False))
    q.set_xticklabels(q.get_xticklabels(), rotation=30)
    plt.title(title2)
    plt.show()
#Create a DataFrame to Group LateFeePercent Based on Loan_Status
def createDF_LateFeePrecent_Categories(categoryColumn):
    """Return mean lateFeePrecent per category of *categoryColumn* (from the
    module-level lending_club), with a LateFeePercent_diff column
    (Charged Off mean minus Fully Paid mean), sorted descending by it."""
    by_status = lending_club.pivot_table(index=categoryColumn,
                                         values='lateFeePrecent',
                                         columns=['loan_status'],
                                         aggfunc='mean').reset_index()
    by_status['LateFeePercent_diff'] = by_status['Charged Off'] - by_status['Fully Paid']
    return by_status.sort_values(by='LateFeePercent_diff', ascending=False)
#Plot using the DataFrame output from the "createDF_LateFeePrecent_Categories" function
def pltLateFeePercentOnCategies(categoryColumn, dataFrame):
    """Bar-plot the LateFeePercent_diff column per category, largest gap first."""
    plt.figure(figsize=(15, 8))
    ordered = dataFrame.sort_values(by='LateFeePercent_diff', ascending=False)
    axes = sns.barplot(x=categoryColumn, y='LateFeePercent_diff', data=ordered)
    plt.ylabel('Charged-Off LateFee Percent On Fully Paid')
    axes.set_xticklabels(axes.get_xticklabels(), rotation=90, fontsize=12)
    # Title text kept verbatim (including the 'Precent' typo) to preserve output.
    plt.title("More LateFee Precent by ChargedOff Subscriber on FullyPaid Subscriber")
    plt.show()
# percentile analysis
def percentileanalysis(column_name, hue):
    """Draw a countplot of *column_name* from the module-level lending_club,
    ordered by frequency and optionally split by *hue*, annotating each bar.

    If *hue* is given (non-null string), bars carry their percentage of all
    rows; otherwise the raw count.
    """
    # pd.Series(None) is empty, so len(temp.unique()) distinguishes
    # "hue supplied" (> 0) from "no hue" (0). Fragile but intentional.
    temp = pd.Series(data = hue)
    fig, ax = plt.subplots()
    width = 18
    fig.set_size_inches(width , 7)
    ax = sns.countplot(data = lending_club, x= column_name, order=lending_club[column_name].value_counts().index,hue = hue)
    if len(temp.unique()) > 0:
        for p in ax.patches:
            # Annotate with the bar's share of the whole dataframe.
            ax.annotate('{:1.1f}%'.format((p.get_height()*100)/float(len(lending_club))), (p.get_x()+0.05, p.get_height()+20))
    else:
        for p in ax.patches:
            # No hue: annotate with the raw count instead.
            ax.annotate(p.get_height(), (p.get_x()+0.32, p.get_height()+20))
    del temp
# Helper functions to do analysis of numeric variable and draw graphs using plotly
def do_analysis_with_plotly_graphs(dataframe, column_name, label):
    """Print decile stats and null count for a numeric column, then render
    interactive plotly charts: an overall histogram, a stacked
    paid/defaulted histogram, box plots (overall and per outcome), and
    distplots (overall and per outcome).

    Requires a notebook session initialized with init_notebook_mode(connected=True).
    """
    print(dataframe[column_name].describe(percentiles=np.linspace(0,1,11)))
    print("Number of null values for the column {0} : {1}".format(column_name, dataframe[column_name].isnull().sum()))
    histogram = go.Histogram(x=dataframe[column_name], name=label)
    boxplot = go.Box(y=dataframe[column_name], name=label)
    # Split the population by outcome: drop() with an inverted filter keeps
    # only the rows whose loan_status matches.
    fully_paid = dataframe.drop(dataframe[dataframe.loan_status != 'Fully Paid'].index)
    charged_off = dataframe.drop(dataframe[dataframe.loan_status != 'Charged Off'].index)
    paid_box = go.Box(y=fully_paid[column_name], name='Paid Off')
    charged_off_box = go.Box(y=charged_off[column_name], name='Charged Off')
    py.offline.iplot([histogram])
    # Stacked histogram: paid vs defaulted counts per bin.
    trace1 = go.Histogram(x=charged_off[column_name], name= 'Defaulted')
    trace0 = go.Histogram(x=fully_paid[column_name], name= 'Paid')
    data = [trace0, trace1]
    layout = go.Layout(barmode='stack')
    fig = go.Figure(data=data, layout=layout)
    py.offline.iplot(fig)
    # Box plots: overall plus one per outcome.
    data = [boxplot, paid_box, charged_off_box]
    py.offline.iplot(data)
    # Distribution plot of the whole column ...
    dist_data = [dataframe[column_name]]
    dist_labels = [label]
    fig = ff.create_distplot(dist_data, dist_labels)
    py.offline.iplot(fig)
    # ... and split by outcome.
    dist_data = [fully_paid[column_name], charged_off[column_name]]
    dist_labels = ['Paid', 'Defaulted']
    fig = ff.create_distplot(dist_data, dist_labels)
    py.offline.iplot(fig)
#Helper Function to Plot the Top Category Distribution in FullyPaid and ChargedOff Subscriber
# Bar Plot shows the Percentage of Total Count Based on Category for total DataFrame (FullyPaid and ChargedOff)
# Line Plot shows the Percentage of Total Count Based on Category (ChargedOff)
def categorical_Percentage_Distribution(columnName, title):
    """For each category of *columnName* in the module-level lending_club:
    bars show the category's share of all rows; the red line shows the
    charged-off share within each category. The table is printed first."""
    totalCount = lending_club[columnName].count()
    # Rows per category; policy_code is just a non-null column to count on.
    X1 = pd.DataFrame(lending_club.groupby(columnName).policy_code.agg(['count'])).reset_index()
    X1['GroupPercentage'] = X1['count']/totalCount
    lending_club_ChargedOff = lending_club.loc[lending_club['loan_status'] == 'Charged Off']
    X2 = lending_club_ChargedOff.groupby(columnName).policy_code.agg(['count']).reset_index()
    X2.rename(columns={'count':'GroupwiseChargedOffCount'}, inplace = True)
    # Inner merge: categories with zero charged-off rows drop out of the plot.
    X2 = pd.merge(X1,X2, on = columnName)
    X2['ChargedOffPercent'] = X2['GroupwiseChargedOffCount']/X2['count']
    print(X2)
    plt.figure(figsize=(12, 8))
    ax = plt.subplot(111)
    p = sns.barplot(x=columnName, y='GroupPercentage', data=X2)
    p.set_xticklabels(p.get_xticklabels(), rotation = 90, fontsize= 12)
    plt.ylabel('Percentage',fontsize = 12)
    plt.xlabel(columnName, fontsize = 12)
    # Line overlay uses the integer index, which lines up with the bar
    # positions 0..n-1 produced by the barplot above.
    plt.plot(X2['ChargedOffPercent'], 'r-')
    plt.title(title, fontsize = 12)
    ax.legend(loc='upper center', bbox_to_anchor=(0.8, 1.00), shadow=True, ncol=2, fontsize = 10)
    plt.show()
# --- Load the raw loan data and analyse/drop null columns ---
lending_club = pd.read_csv('loan.csv')
# Report column 47 — presumably the mixed-dtype column flagged by
# read_csv on the first load; confirm against the load warning.
print("Column Name Which Failed to Load ==> ", lending_club.columns[47])
customDataType = dict(zip(lending_club.columns,lending_club.dtypes)) #Get the Datatype
# Force next_pymnt_d to be parsed as string on the reload.
customDataType['next_pymnt_d'] = 'str'
lending_club = pd.read_csv('loan.csv',dtype=customDataType) #Reload
#Get Null% in each Column of loan dataframe
nullColumns = pd.DataFrame(round(100*(lending_club.isnull().sum()/len(lending_club.index)), 2), columns=['null%Column'])
#Create a Map Between Columns to Null% in DataFrame
dictColumn2NullPercent = dict(zip(nullColumns.index,nullColumns['null%Column']))
#Create the DataFrame with NullPercentage and BinaryCategory of Null and Not-Null
LendingClubCol_Df = pd.DataFrame(np.array(lending_club.columns), columns=['lending_club_columns'])
LendingClubCol_Df['NullPrecentage'] = LendingClubCol_Df['lending_club_columns'].apply(lambda x: dictColumn2NullPercent.get(x))
LendingClubCol_Df['NullPrecentageMoreThan95%'] = LendingClubCol_Df['NullPrecentage'].apply(lambda x: 'Null_Columns' if x > 95.0 else 'Non Null_Columns')
LendingClubCol_Df.head()
#Plot Each Column with NUll %
plt.figure(figsize=(24,10))
plt.subplot(2,1,1)
p = sns.barplot(x = 'lending_club_columns', y = 'NullPrecentage', data = LendingClubCol_Df)
plt.xlabel('Null Percentage Of Columns', fontsize = 20)
plt.ylabel('Null %', fontsize= 20)
p.set_xticklabels(p.get_xticklabels(),rotation=90, fontsize = 15)
plt.title('Columns to Null%')
plt.show()
#Plot a Count Plot to see how many with maximum NULL value and how many columns can be considered for the analysis
plt.figure(figsize=(6, 15))
plt.subplot(2, 1, 1)
q = sns.countplot(x='NullPrecentageMoreThan95%', data=LendingClubCol_Df)
q.set_xticklabels(q.get_xticklabels(),rotation=30)
plt.title('Null Column Analysis')
plt.show()
# Aggregate: how many columns share each null percentage.
# NOTE(review): this rebinds LendingClubCol_Df to the aggregated frame.
LendingClubCol_Df = LendingClubCol_Df.groupby('NullPrecentage').lending_club_columns.agg(['count']).sort_values(by='count', ascending=False).reset_index()
plt.figure(figsize=(18,6))
p = sns.barplot(x = 'NullPrecentage', y = 'count', data = LendingClubCol_Df)
plt.xlabel('Null Percentage Of Columns', fontsize = 12)
plt.ylabel('Count of Null %', fontsize= 12)
plt.title('Columns to Null Percentage')
plt.show()
#np Vectorized Function to Filter Null Columns from the Loan Dataframe
# NOTE(review): keeps columns with null% < 100, i.e. only columns that are
# entirely null are dropped — not the >95% ones highlighted above.
Func_filterNullCol = np.vectorize(lambda x: True if dictColumn2NullPercent.get(x) < 100 else False)
Func_filterNullCol(np.array(lending_club.columns))
#Drop All the Null COlumns
lending_club = lending_club.loc[:,Func_filterNullCol(np.array(lending_club.columns))]
lending_club.head()
# Manually drop columns judged not useful for the default analysis.
unncessaryColumns = ['desc', 'title', 'initial_list_status','application_type',
'acc_now_delinq', 'delinq_amnt', 'pymnt_plan', 'url']
lending_club.drop(columns=unncessaryColumns, inplace=True)
print("********Columns Considered for Analysis as follows:====>\n ", lending_club.columns)
print("********Total Number of Columns Considered:====> ", len(lending_club.columns))
==> int_rate¶lending_club['int_rate'] = round(lending_club['int_rate'].apply(lambda x: (str(x).split("%")[0])).apply(pd.to_numeric),0)
==> installment¶lending_club['installment'] = round(lending_club['installment'].apply(pd.to_numeric),0)
==> emp_length¶#Remove > and < Symbol and < 1 Value set to 0
lending_club['emp_length'] = lending_club['emp_length'].apply(lambda x: str(x).split("year")[0].\
strip().split("+")[0]).apply(lambda x: 0 if str(x).find("<") >=0 else x)
==> home_ownership¶# Only 3 Records in NONE Category with Full Paid, Hence same record is discarded
lending_club = lending_club[lending_club['home_ownership'] != 'NONE']
==> issue_d, earliest_cr_line¶#lending_club['issue_d'] = lending_club['issue_d'].apply(changeDate2SequenceOfInt)
#lending_club['earliest_cr_line'] = lending_club['earliest_cr_line'].apply(changeDate2SequenceOfInt)
==> zip_code¶lending_club['zip_code'] = lending_club['zip_code'].apply(lambda x: str(x)[0:3])
==> next_pymnt_d, last_pymnt_d, last_credit_pull_d¶lending_club['next_pymnt_d'] = lending_club['next_pymnt_d'].apply(changeDate2SequenceOfInt)
#lending_club['last_pymnt_d'] = lending_club['last_pymnt_d'].apply(changeDate2SequenceOfInt)
lending_club['last_credit_pull_d'] = lending_club['last_credit_pull_d'].apply(changeDate2SequenceOfInt)
==> Data Cleanup loan_status
Since the Current loan statuses cannot be used to determine whether a loan is paid or charged off,
We will remove the loan_status with the value Current. We will save it in a separate dataframe for future analysis
# Keep the full frame (including 'Current' loans) for reference.
# NOTE(review): this is an alias, not a copy — in-place edits affect both
# names until lending_club is rebound by the filters below.
lending_club_master = lending_club
# 'Current' loans have no final outcome yet; park them separately ...
lending_club_current = lending_club.loc[lending_club['loan_status'] == 'Current']
# ... and restrict the working frame to completed loans.
lending_club = lending_club.loc[lending_club['loan_status'] != 'Current']
# Convenience subset: only the defaulted loans.
lending_club_ChargedOff = lending_club.loc[lending_club['loan_status'] == 'Charged Off']
loan_amnt¶do_analysis(lending_club, 'loan_amnt')
The loan amount has spikes at the round numbers like 5, 000$, 10,000 $, 15, 000$, 20, 000$ 25, 000 $ etc.
This can be an input to the UX team or the Business Development team to create loan packages at these round numbers.
run_t_test(lending_club, 'loan_amnt')
Visually, there seems to be a slight difference in the means of the loan amount for charged off and fully paid.
However, T Test indicates that loan amount has a bearing on the loan defaults
loan_amnt¶lending_club[['annual_inc', 'loan_amnt']].corr()
The Loan amount is corelated positively with the annual income
funded_amnt¶# Is funded_amnt same as loan amnt.
#What % of rows where funded amount and loan amount are not same.
percent_difference = lending_club.loc[lending_club['loan_amnt'] != lending_club['funded_amnt']]['funded_amnt'].shape[0]/ lending_club.shape[0]
print("% .2f" % (percent_difference * 100))
# Is the monetary difference more. yes
(lending_club['loan_amnt'] - lending_club['funded_amnt']).sum()
do_analysis(lending_club, 'funded_amnt')
Charged off has a higher average funded amount than the fully paid segment
Also, since the loan_amount and funded_amount are highly corelated, and there is not a lot a difference, we can just use the funded_amount column and drop the loan amount column.
# corelation between loan amount and funded amount
lending_club[['loan_amnt', 'funded_amnt']].corr()
funded_amnt_inv¶do_analysis(lending_club, 'funded_amnt_inv')
term¶lending_club['term'].value_counts()
do_analysis_categorical(lending_club, 'term')
11% of customers with tenure of 36 months default
25.42 % of the customers with tenure of 60 months default
Higher tenure leads to higher default rates.
installment¶lending_club['installment'].value_counts().head()
#lending_club['installment'].isnull().sum()
do_analysis(lending_club, 'installment')
# corelation between loan amount and funded amount
lending_club[['loan_amnt', 'installment']].corr()
#categorizing the installment to bins to get better insights
def categorizefunc(x):
    """Bucket an installment amount into a labelled range.

    Bins are [<=200], (200,400], (400,600], (600,800], (800,1000], and
    '1000>' above that. Fix: the original guard `x > 0` sent an installment
    of exactly 0 into the '1000>' bucket; it now lands in '0-200'.
    """
    if x <= 200:
        return '0-200'
    elif x <= 400:
        return '200-400'
    elif x <= 600:
        return '400-600'
    elif x <= 800:
        return '600-800'
    elif x <= 1000:
        return '800-1000'
    else:
        return '1000>'
lending_club['categorical_installment']=lending_club['installment'].apply(categorizefunc)
lending_club['categorical_installment']
do_analysis_categorical(lending_club, 'categorical_installment')
We will not remove any row for installment as all the rows seem valid.
correlation between installment and loan_amount is high. more number of customers pay installment around $180. The number of fully paid loans are with installments between 200 to 400
grade¶pltUniVariableAnalysis('grade',lending_club)
pltUniVariableAnalysis('grade',lending_club_ChargedOff)
sub_grade¶pltUniVariableAnalysis('sub_grade', lending_club.sort_values(by='sub_grade'))
pltUniVariableAnalysis('sub_grade', lending_club_ChargedOff.sort_values(by='sub_grade'))
emp_length¶pltUniVariableAnalysis('emp_length',lending_club)
home_ownership¶pltUniVariableAnalysis('home_ownership',lending_club_master)
pltUniVariableAnalysis('home_ownership', lending_club_ChargedOff)
columnName = 'home_ownership'
title1 = 'Loan Amount Distribution Based on Home Ownership for ChargedOff Subscriber'
title2 = 'Loan Amount Distribution Based on Home Ownership for Fully Paid Subscriber'
pltCategorical(columnName, title1, title2)
annual_inc¶do_analysis(lending_club, 'annual_inc')
# 5th percentile
lending_club.annual_inc.quantile(0.05)
#What percent of values fall below the 5th percentile?
percent = lending_club.loc[lending_club['annual_inc'] < 24000].shape[0] * 100 / lending_club.shape[0]
print("% .2f" % percent)
# 95th percentile
lending_club.annual_inc.quantile(0.99)
# Percentage of values that fall in above the 99th percentile (i.e top 1 percentile)
percent = lending_club.loc[lending_club['annual_inc'] > 234999].shape[0] * 100/ lending_club.shape[0]
print("% .2f" % percent)
lending_club['annual_inc'].describe()
The spread is between around 40,000 and 82,300. We will drop the outliers top 99 percentile. We will not remove the bottom 5 percentile because the lenders can be from the low income group. Also below the 5th percentile, there are 5% of rows. so we will not remove the bottom 5th percentile.
# Removing the top 1 percentile of outliers
lending_club = lending_club.loc[lending_club['annual_inc'] < 234999]
issue_d¶lending_club['issue_d']
lending_club['issue_month']=lending_club['issue_d'].apply(lambda x:x.split('-')[0])
lending_club['issue_year']=lending_club['issue_d'].apply(lambda x:x.split('-')[1])
lending_club['issue_year'] = lending_club['issue_year'].apply(pd.to_numeric)
percentileanalysis('issue_year','loan_status')
No clean up done for issue_d as there are no null values.
loan_status¶lending_club_master.pivot_table(values='loan_amnt', index='loan_status', aggfunc='count').plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
purpose¶lending_club_ChargedOff.pivot_table(values='loan_amnt', index='purpose', aggfunc='sum').sort_values(by='loan_amnt', ascending=False).plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
columnName = 'purpose'
title1 = 'Loan Amount Distribution Based on "Purpose" of Loan, for ChargedOff Subscriber'
title2 = 'Loan Amount Distribution Based on "Purpose" of Loan, for Fully Paid Subscriber'
pltCategorical(columnName, title1, title2)
addr_state¶df1_addr_state = lending_club_ChargedOff.pivot_table(values='policy_code', index='addr_state', aggfunc='count').sort_values(by='policy_code', ascending=False)
df1_addr_state.plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
df1_addr_state = df1_addr_state.reset_index()
plt.figure(figsize=(15,5))
plt.plot(df1_addr_state.addr_state, df1_addr_state.policy_code, 'o-')
plt.show()
columnName = 'addr_state'
title1 = 'Loan Amount Distribution Based on "addr_state" of Loan, for ChargedOff Subscriber'
title2 = 'Loan Amount Distribution Based on "addr_state" of Loan, for Fully Paid Subscriber'
pltCategorical(columnName, title1, title2)
dti¶do_analysis(lending_club, 'dti')
* Since there are no null values, and no outliers for dti, there is no need to do a datacleanup
earliest_cr_line¶lending_club['earliest_cr_line']
lending_club['earliestcrline_month']=lending_club['earliest_cr_line'].apply(lambda x:x.split('-')[0])
lending_club['earliestcrline_year']=lending_club['earliest_cr_line'].apply(lambda x:x.split('-')[1])
lending_club['earliestcrline_year'] = lending_club['earliestcrline_year'].apply(pd.to_numeric)
do_analysis_categorical(lending_club, 'earliestcrline_year')
Since there are no null values no cleanup is needed.
Business for banks increased in terms of number of loans, can be predicted to increase in coming years in similar format.
inq_last_6mths¶df1_inq_last_6mths = lending_club_ChargedOff.pivot_table(values='policy_code', index='inq_last_6mths', aggfunc='count').reset_index()
plt.figure(figsize=(15,5))
plt.plot(df1_inq_last_6mths.inq_last_6mths, df1_inq_last_6mths.policy_code, '-o')
plt.show()
* Number of Inquary Count is more from (0 to 4) in last 6 months period
mths_since_last_delinq¶do_analysis(lending_club, 'mths_since_last_delinq')
# There are 24675 null values. The null values means that the customer has been repaying the loan on time.
# We will do this data cleanup by imputing these null values to 0.
lending_club.loc[lending_club['mths_since_last_delinq'].isnull(),'mths_since_last_delinq'] = 0
# Analysis of mths_since_last_delinq excluding the 0 values.
lending_club.loc[lending_club['mths_since_last_delinq'] != 0]['mths_since_last_delinq'].plot(kind='hist', bins=50)
lending_club.loc[lending_club['mths_since_last_delinq'] != 0][['mths_since_last_delinq', 'loan_status']].boxplot(by='loan_status')
* Charged off loans have a higher months since last delinquient
open_acc¶#do_analysis(lending_club, 'open_acc')
lending_club['open_acc'].plot(kind='hist')
#DERIVED
lending_club['Ratio_Open_Total_acc']=(lending_club['open_acc']/lending_club['total_acc'])*100
#lending_club['Ration_Open_Total_acc']
do_analysis(lending_club, 'Ratio_Open_Total_acc')
No Data Clean up done.
1. There are profiles with as 10 open accounts is the highest.
2. There are outliers with more than 20 open accounts .
3. Out of 3000 profiles around 800 have the open_acc/total_acc ratio as 100% ie all loans are either Current/ChargedOff
4. These profiles should never be considered for next loan request,Also need to check for 75% and above cases.
</b>
pub_rec¶#lending_club_master.pivot_table(values='policy_code', index='pub_rec', aggfunc='count', columns=['loan_status']).plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
pltUniVariableAnalysis('pub_rec', lending_club_master)
pltUniVariableAnalysis('pub_rec', lending_club_ChargedOff)
revol_bal¶do_analysis(lending_club, 'revol_bal')
# 90% of the revol_bal values are under 28, 541 dollars. The max loan amount is around 35, 000 dollars.
# It is likely that the values above 80, 000 are outliers, and we can remove them.
# Lets look at the percentage of rows where revol_bal is above 80,000
lending_club.loc[lending_club['revol_bal'] > 80000].shape[0] * 100 / lending_club.shape[0]
# To decide if we can drop these rows we will see if there is a major correlation between
# revol_bal and charged off for values over 60,000
lending_club.loc[lending_club['revol_bal'] > 80000][['revol_bal', 'loan_status']].boxplot(by='loan_status')
# Since there is a visible corelation between revolving balance and loan status, we cannot remove the values above 80,000
# The values above 80, 000 seem to be valid values.
lending_club[['revol_bal', 'loan_status']].groupby('loan_status').quantile([0.5])
* The median revolving balance for charged off loans is 6.3% higher than the fully paid loans.
# Is revol_bal and revol_util corelated. They are not highly corelated.
lending_club[['revol_bal','revol_util']].corr()
total_acc¶lending_club['total_acc']
do_analysis(lending_club, 'total_acc')
no data clean up done as no null values or incorrect values
There are around 2800 profiles with total accounts around 18 All profiles with charged off/current/Fully paid status have 20 total accounts on an average
out_prncp_inv¶lending_club['out_prncp_inv'].value_counts()
do_analysis(lending_club,'out_prncp_inv')
#since most of the outstanding prnc is 0 need to filter out to get better insights
lending_club_temp=lending_club.loc[lending_club['out_prncp_inv'] > 0]
#lending_club_temp['out_prncp_inv']
#do_analysis(lending_club_temp,'out_prncp_inv')
Most of the profiles have dollars 600 as the outstanding_prncp_inv while the next peak is at dollars 1200 . Highest outstanding prncipal is around dollars 6300
total_pymnt¶print("Total Number of Records: ", lending_club_master['total_pymnt'].shape[0])
print("Total Unique Counts of the Records: ", len(lending_club_master['total_pymnt'].unique()))
do_analysis(lending_club, 'total_pymnt')
def splitTotalPayment(x):
    """Bucket a total payment: Low (<= 5000), Medium (<= 15000), High (> 15000)."""
    if x <= 5000:
        return "Low"
    if x <= 15000:
        return "Medium"
    return "High"
lending_club_master['Temp_Total_Payment'] = lending_club_master['total_pymnt'].apply(splitTotalPayment)
pltUniVariableAnalysis('Temp_Total_Payment', lending_club_master)
lending_club_master.drop(columns='Temp_Total_Payment', inplace=True)
total_rec_int¶#lending_club['total_rec_int'].value_counts()
do_analysis(lending_club,'total_rec_int')
Though there are outliers , removing them will not have a significant impact hence retaining them.
High number of profiles around 6500 have paid dollars 0.0 interest : reason has to be found and such profiles should be handled sepearately.
There are few profiles having paid around dollars 23000 : can checked if principal
total_rec_late_fee¶checkNullPerForColumns('total_rec_late_fee')
totalChargedOffSub = lending_club_ChargedOff['total_rec_late_fee'].shape[0]
positiveLatePayment = len(lending_club_ChargedOff[lending_club_ChargedOff['total_rec_late_fee'] > 0])
print("Total Number of Subscriber Records: ==>", totalChargedOffSub)
print("Total Unique Counts of the Records: ==>", len(lending_club_ChargedOff['total_rec_late_fee'].unique()))
print("Total Number of +ve Late Payment, ChargedOff Subscriber: ==>",positiveLatePayment)
print("% of ChargedOff Subscriber Done Late Payment: ==>", round((positiveLatePayment/totalChargedOffSub)*100, 0))
print(lending_club_master.groupby(['loan_status'])['total_rec_late_fee'].agg(['count', 'sum']))
recoveries¶do_analysis(lending_club, 'recoveries')
# Recovery is non zero for only charged off loans. So we will analyse recovery for Charged Off loans
lending_club.loc[lending_club['recoveries'] > 0]['recoveries'].plot(kind='hist')
# There are no null values for recoveries. Therefore there is no need for data cleanup.
# Recoveries is 0 for fully paid loans
lending_club[['recoveries', 'loan_status']].groupby('loan_status').quantile([.5])
last_pymnt_d¶lending_club['last_pymnt_d']
lending_club['last_pymnt_d']=lending_club['last_pymnt_d'].astype(str)
lending_club['last_pymnt_month']=lending_club['last_pymnt_d'].apply(lambda x:x.split('-')[0])
#lending_club['last_pymnt_year']=lending_club['last_pymnt_d'].apply(lambda x:x.split('-')[1] if x.find("-") else 0)
#lending_club['last_pymnt_year'].apply(pd.to_numeric)
#lending_club['last_pymnt_month']
#lending_club['last_pymnt_d']
#lending_club['last_pymnt_year']
#lending_club['last_pymnt_year']=lending_club['last_pymnt_d'].apply(lambda x:x.split('-')[1])
last_pymnt_amnt¶checkNullPerForColumns('last_pymnt_amnt')
totalChargedOffSub = lending_club_ChargedOff['last_pymnt_amnt'].shape[0]
positiveLastPaymnt = len(lending_club_ChargedOff[lending_club_ChargedOff['last_pymnt_amnt'] > 0])
print("Total Number of Subscriber Records: ==>", totalChargedOffSub)
print("Total Unique Counts of the Records: ==>", len(lending_club_ChargedOff['last_pymnt_amnt'].unique()))
print("Total Number of +ve Late Payment, ChargedOff Subscriber: ==>",positiveLastPaymnt)
do_analysis(lending_club_ChargedOff, 'last_pymnt_amnt')
def splitLatePayment(x):
    """Bucket a payment: Low (<= 5000), Medium (<= 15000), High (> 15000).

    NOTE(review): identical logic to splitTotalPayment; kept as a separate
    name for the last_pymnt_amnt analysis.
    """
    if x <= 5000:
        return "Low"
    if x <= 15000:
        return "Medium"
    return "High"
# BUG FIX (copy-paste): this section analyses last_pymnt_amnt, but the
# original reapplied splitTotalPayment to total_pymnt — an exact repeat of
# the total_pymnt section above, leaving splitLatePayment unused.
lending_club_master['Temp_Last_Payment'] = lending_club_master['last_pymnt_amnt'].apply(splitLatePayment)
pltUniVariableAnalysis('Temp_Last_Payment', lending_club_master)
lending_club_master.drop(columns='Temp_Last_Payment', inplace=True)
collections_12_mths_ex_med¶lending_club=lending_club[lending_club.collections_12_mths_ex_med.notnull()]
lending_club['collections_12_mths_ex_med'].isnull().sum()
do_analysis_categorical(lending_club,'collections_12_mths_ex_med')
#lending_club['collections_12_mths_ex_med']
int_rate¶do_analysis_with_plotly_graphs(lending_club, 'int_rate', 'Interest Rate')
Borrowers who defaulted usually had a higher interest rate compared to those that did not.
emp_title¶Even though the description for the column emp_title says it is the job title supplied by the person taking the loan, it turns out to be the name of the employer.
Lets start by cleaning the data a bit and then looking at it.
To clean the data the process will be to see the different ways the loan applicants have spelled the different employers and fix those. This will be an iterative process.
# --- emp_title cleanup: normalize employer-name spellings ---
# Fix: regex=False makes every pattern a literal string. The original
# relied on str.replace's regex default, so '.' in 'u.s. army' matched any
# character and '(dod)' was parsed as a regex capture group instead of
# literal parentheses.
lending_club.emp_title = lending_club.emp_title.str.lower()
lending_club.emp_title = lending_club.emp_title.str.strip()
lending_club.emp_title = lending_club.emp_title.str.replace('united states army', 'us army', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('u.s. army', 'us army', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('self-employed', 'self employed', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('self -employed', 'self employed', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('self employed', 'self', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('wells fargo bank', 'wells fargo', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('jpmorgan chase', 'jp morgan chase', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states postal service', 'usps', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('us air force', 'usaf', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states air force', 'usaf', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states navy', 'us navy', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('us postal service', 'usps', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united parcel service', 'ups', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('wal-mart', 'walmart', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defence (dod)', 'department of defence', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defence, usaf', 'department of defence', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defence oig', 'department of defence', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defence - nawctsd', 'department of defence', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states marine corps', 'usmc', regex=False)
employers = lending_club.groupby('emp_title').count()
employers=employers.sort_values(by=['id'], ascending=False)
employers['id'].head(50)
Looking at this data it is quite clear that people who work for the government — be it in the military, the postal service, or other government agencies — seem to be taking the maximum number of loans.
verification_status¶do_analysis_categorical(lending_club, 'verification_status')
From the above data and chart it is very clear that verified users are more likely not to pay back their loans.
delinq_2yrs¶do_analysis_categorical(lending_club, 'delinq_2yrs')
From the above data it is clear that users that have been delinquent in the last 2 years are more likely not to pay back their loan.
# Strip the trailing '%' so revol_util becomes numeric. regex=False makes
# the pattern explicitly literal (and version-stable across pandas).
lending_club.revol_util = lending_club.revol_util.str.replace('%', '', regex=False)
lending_club.revol_util = pd.to_numeric(lending_club.revol_util)
# Separate frame without missing utilization, used for the revol_util analysis.
lending_club_revol_util = lending_club.dropna(subset=['revol_util'])
revol_util¶do_analysis_with_plotly_graphs(lending_club_revol_util, 'revol_util', 'revoling utilization')
What is very interesting here is that the distribution of charged off loans increases after the utilization crosses 50%, and it really picks up after 70%.
pub_rec_bankruptcies¶do_analysis_categorical(lending_club, 'pub_rec_bankruptcies')
Users with a public record of a bankruptcies are more likely not to pay their loans.
# Creating a function to segment customers based on their income levels.
# LOW is below 30,000; MID is 30,000 (inclusive) to 90,000 (exclusive); HIGH is 90,000 and above.
def get_income_segment(income):
    """Return 'LOW', 'MID', or 'HIGH' for an annual income.

    Returns None for NaN income (every comparison is False), matching the
    original fall-through behavior.
    """
    if income < 30000:
        return 'LOW'
    if 30000 <= income < 90000:
        return 'MID'
    if income >= 90000:
        return 'HIGH'
# Segment the customers by income level.
# Apply directly on the annual_inc Series rather than a row-wise
# apply(..., axis=1): the result is identical (the lambda only read
# row['annual_inc']) but this avoids materializing every row as a Series.
lending_club['income_level'] = lending_club['annual_inc'].apply(get_income_segment)
# Segmented Univariate Analysis
# policy_code is 1 on every row, so summing it per cell yields loan counts
# by income segment (rows) and loan status (columns).
pd.pivot_table(lending_club, values = 'policy_code', index = ['income_level'], columns = ['loan_status'], aggfunc = np.sum)
# Segmented Univariate Analysis
# Same counts, rendered as one pie chart per income segment to compare the
# loan-status mix across segments.
pd.pivot_table(lending_club, values = 'policy_code', index = ['loan_status'], columns = ['income_level'], aggfunc = np.sum).plot(kind='pie', subplots=True, figsize=(24, 8))
# Segmented Univariate Analysis. todo create barcharts so that all bars are of same height
# Stacked bars: loan-status composition within each income segment.
pd.pivot_table(lending_club, values = 'policy_code', index = ['income_level'], columns = ['loan_status'], aggfunc = np.sum).plot(kind='bar', stacked=True, figsize=(24, 8))
Charged-off share by income segment:
- HIGH income segment: 841 / (6808 + 218 + 841) ≈ 10%
- MID income segment: 4043 / (4043 + 789 + 22758) ≈ 14%
- LOW income segment: 700 / (700 + 56 + 3043) ≈ 18%
This shows that loan default increases in the lower income segments.
# Loan-amount distribution for charged-off loans in the LOW income segment.
lending_club.loc[(lending_club['income_level'] == 'LOW') & (lending_club['loan_status'] == 'Charged Off')]['loan_amnt'].plot(kind='hist', bins=50)
# I also want to analyse whether people who ask for more get less.
# Is there a correlation between this difference and charge-off?
lending_club['asking_lending_difference'] = lending_club['loan_amnt'] - lending_club['funded_amnt']
do_analysis(lending_club, 'asking_lending_difference')
# Box plot by loan status, restricted to rows where the borrower was funded
# less than they asked for (asking_lending_difference > 0).
lending_club[['asking_lending_difference', 'loan_status']].loc[lending_club['asking_lending_difference'] > 0].boxplot(by='loan_status')
The charged off loans had asked for a higher amount but got lower loan amount.
run_t_test(lending_club, 'asking_lending_difference')
# Pairwise correlations between the key numeric loan variables.
corr = lending_club[['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment',
                     'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
                     'total_rec_int', 'annual_inc', 'revol_bal', 'last_pymnt_amnt',
                     'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'dti']].corr()
fig, ax = plt.subplots(figsize=(15, 15))
# Correlation coefficients range over [-1, 1]; span the full range so the
# diverging 'coolwarm' map is centred at 0 and negative correlations are
# distinguishable. (The original vmin=0 clipped every negative value to the
# same colour as zero correlation.)
ax.matshow(corr, cmap=cm.get_cmap('coolwarm'), vmin=-1, vmax=1)
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical', fontsize=16);
plt.yticks(range(len(corr.columns)), corr.columns, fontsize=16);
# Hierarchically cluster the variables, using each variable's vector of
# correlations with the others as its feature representation (average linkage),
# then draw the dendrogram to show which variables move together.
Z = linkage(corr, 'average')
plt.figure(figsize=(25, 10))
labelsize=20
ticksize=15
plt.title('Hierarchical Clustering Dendrogram for loan variables', fontsize=labelsize)
plt.xlabel('variable', fontsize=labelsize)
plt.ylabel('distance', fontsize=labelsize)
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
    labels = corr.columns
)
pylab.yticks(fontsize=ticksize)
pylab.xticks(rotation=-90, fontsize=ticksize)
#plt.savefig('img/dendogram_'+index+'.png')
plt.show()
# Scatter of loan amount vs annual income, coloured by loan status, to look
# for clusters of defaults. NOTE(review): raises KeyError if loan_status has
# values outside the three mapped categories — confirm upstream filtering.
fig, ax = plt.subplots(figsize=(17,17))
colours = {'Fully Paid':'green', 'Charged Off':'red', 'Current':'blue'}
ax.scatter(lending_club['loan_amnt'], lending_club['annual_inc'], c=lending_club['loan_status'].apply(lambda x: colours[x]), alpha=0.5)
Plot to see if there is a correlation between the loan amount and annual income, and to see if there are any clusters
of defaults. We see that the loan amount is correlated with annual income; however, there is no overall pattern or cluster of
defaults.
At lower income levels — below 10,000 USD — combined with higher loan amounts — above 15,000 USD — there does seem to be a concentration of defaults.
Grade, SubGrade, Interest Rate with Respect to Other Columns (BiVariate)¶#Grade, Sub-Grade to Loan Percentage, As per the Graph, Upper Grade is Having Higher Interest Rate
# Mean interest rate per (grade, sub_grade), sorted ascending: lower grades
# carry higher interest rates.
lending_club.groupby(['grade', 'sub_grade'])['int_rate'].agg(['mean']).sort_values(by='mean').plot(kind='bar', figsize=(18,7))
# Create a single column by merging grade and sub_grade (e.g. 'A_A1').
lending_club['grade_subgrade'] = lending_club['grade'] + "_" + lending_club['sub_grade']
lending_club['grade_subgrade'].nunique()
# Create a derived column for the late-fee percent with respect to total payment.
# NOTE(review): the misspelled column name 'lateFeePrecent' is referenced by
# helpers elsewhere in this notebook (createDF_LateFeePrecent_Categories), so
# the spelling must not be changed here in isolation.
lending_club['lateFeePrecent'] = round((lending_club['total_rec_late_fee']/lending_club['total_pymnt'])*100,2)
# Mean late-fee percent per loan status.
X1 = lending_club.groupby('loan_status'). lateFeePrecent.agg(['mean']).reset_index()
plt.figure(figsize=(8,8))
sns.barplot(x = 'loan_status', y='mean', data = X1)
plt.xlabel('Loan Status', fontsize = 12)
plt.ylabel('Late Fee Perecent', fontsize = 12)
plt.show()
# Get the average interest rate for each grade_subgrade bucket.
group_subgroup_intRate = pd.DataFrame(lending_club.groupby('grade_subgrade').int_rate.agg(['mean']).reset_index())
# Merge the bucket average back onto the main frame as 'avgIntRate'.
# NOTE(review): this reassigns lending_club; the inner merge may reorder rows.
lending_club = pd.merge(lending_club,group_subgroup_intRate, how = 'inner', on = ['grade_subgrade']).rename(columns = {'mean':'avgIntRate'})
plt.figure(figsize=(8,8))
# Box plot of the grade-level average interest rate per loan status.
sns.boxplot('loan_status', 'avgIntRate', data = lending_club)
plt.xlabel('Loan Status', fontsize = 12)
plt.ylabel('Average Interest Rate', fontsize = 12)
plt.show()
# Grade impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('grade')
X1.head()
# Plot grade impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('grade',X1)
# grade_subgrade impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('grade_subgrade')
X1.head()
# Plot grade_subgrade impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('grade_subgrade',X1)
"Home Ownership" with respect to Other Columns (BiVariate)¶#home_ownership impact on LateFee Precentage
# home_ownership impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('home_ownership')
X1.head()
# Plot home_ownership impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('home_ownership',X1)
"purpose" with respect to Other Columns (BiVariate)¶#purpose impact on LateFee Precentage
# purpose impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('purpose')
X1.head()
# Plot purpose impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('purpose',X1)
"addr_state" with respect to Other Columns (BiVariate)¶#addr_state impact on LateFee Precentage
# addr_state impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('addr_state')
X1.head()
# Plot addr_state impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('addr_state',X1)
"inq_last_6mths" with respect to Other Columns (BiVariate)¶#inq_last_6mths impact on LateFee Precentage
# inq_last_6mths impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('inq_last_6mths')
X1.head()
# Plot inq_last_6mths impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('inq_last_6mths',X1)
"pub_rec" with respect to Other Columns (BiVariate)¶#pub_rec impact on LateFee Precentage
# pub_rec impact on the late-fee percentage.
X1 = createDF_LateFeePrecent_Categories('pub_rec')
X1.head()
# Plot pub_rec impact on the late-fee percentage by loan category.
pltLateFeePercentOnCategies('pub_rec',X1)
def convert_loan_status_to_binary(loan_status):
    """Encode loan_status as a binary regression target.

    Returns 1 for 'Charged Off', 0 for 'Fully Paid', and None for any other
    status (e.g. 'Current'); pandas stores None as NaN, so those rows are
    excluded when the logistic regression is fitted.
    """
    if loan_status == 'Charged Off':
        return 1
    if loan_status == 'Fully Paid':
        return 0
# Binary target: 1 = Charged Off, 0 = Fully Paid, NaN otherwise (dropped by fit).
lending_club['loan_status_binary'] = lending_club['loan_status'].apply(lambda x:convert_loan_status_to_binary(x))
# Normalize the columns so that they lie between 0 and 1
# This will help ensure that the coefficients of logistic regression follow the same scale for comparison
lending_club_normalized = lending_club.copy()
#int_rate, revol_util
# Min-max scale each numeric feature onto [0, 1] in the copied frame.
for feature_name in ['annual_inc', 'loan_amnt', 'dti', 'revol_bal', 'delinq_2yrs', 'pub_rec_bankruptcies', 'funded_amnt', 'open_acc']:
    max_value = lending_club[feature_name].max()
    min_value = lending_club[feature_name].min()
    lending_club_normalized[feature_name] = (lending_club[feature_name] - min_value) / (max_value - min_value)
#int_rate + revol_util
# Logistic regression for statistical inference; C(...) marks categorical terms.
lreg = smf.logit(formula ='loan_status_binary ~ annual_inc + loan_amnt + dti + revol_bal + C(income_level) + delinq_2yrs + pub_rec_bankruptcies + funded_amnt + C(term) + open_acc', data = lending_club_normalized).fit()
print(lreg.summary2())
print("Sorted odds ratios of each variable")
# exp(coefficient) is the odds ratio per unit change of each scaled variable.
print((np.exp(lreg.params)).sort_values(ascending=False))
print("lower and upper confidence intervals")
print(lreg.conf_int())
delinq_2yrs¶categorical_Percentage_Distribution('delinq_2yrs', '% Distribution of delinq_2yrs and ChargedOff Subscriber% on Total')
pub_rec_bankruptcies¶categorical_Percentage_Distribution('pub_rec_bankruptcies', 'Distribution of pub_rec_bankruptcies and ChargedOff Subscriber% on Total')
term¶categorical_Percentage_Distribution('term', 'Distribution of term column and ChargedOff Subscriber% on Total')
pub_rec_bankruptcies¶categorical_Percentage_Distribution('pub_rec_bankruptcies', 'Distribution of term column and ChargedOff Subscriber on Total')
revol_bal¶print(lending_club['revol_bal'].describe(percentiles = np.arange(0.0,1.0,0.05)))
# Distribution of revolving balance.
# NOTE(review): sns.distplot is deprecated in newer seaborn — consider histplot.
plt.figure(figsize=(15,7))
sns.distplot(lending_club['revol_bal'])
plt.show()
# Split revol_bal into 4 categories at its quartile boundaries:
# <=3634 (25%), 3634-8710 (50%), 8710-16741 (75%), >16741 (above 75%)
def splitRevolBal2Category(x):
    """Bucket a revol_bal value into its quartile label ('Qtr1'..'Qtr4').

    Each elif already guarantees the lower bound, so only the upper bound is
    tested. NaN fails every comparison and falls into the final else, so it
    is labelled 'Qtr4' (same as the original behaviour).
    """
    if x <= 3634:
        return 'Qtr1'
    elif x <= 8710:
        return 'Qtr2'
    elif x <= 16741:
        return 'Qtr3'
    else:
        return 'Qtr4'
# Label each row with its revolving-balance quartile, then plot the
# charged-off share per bucket.
lending_club['Revol_Bal_Category'] = lending_club['revol_bal'].apply(splitRevolBal2Category)
categorical_Percentage_Distribution('Revol_Bal_Category', 'Distribution of Revol_Bal_Category column and ChargedOff Subscriber% on Total')
Annual_Inc¶print(lending_club['annual_inc'].describe(percentiles = np.arange(0.0,1.0,0.05)))
# Distribution of annual income.
# NOTE(review): sns.distplot is deprecated in newer seaborn — consider histplot.
plt.figure(figsize=(15,7))
sns.distplot(lending_club['annual_inc'])
plt.show()
# Split annual income into 4 categories at its quartile boundaries:
# <=40000 (25%), 40000-58000 (50%), 58000-80140 (75%), >80140 (above 75%)
def splitAnnualIncome2Category(x):
    """Bucket an annual_inc value into its quartile label ('Qtr1'..'Qtr4').

    Each elif already guarantees the lower bound, so only the upper bound is
    tested. NaN fails every comparison and falls into the final else, so it
    is labelled 'Qtr4' (same as the original behaviour).
    """
    if x <= 40000:
        return 'Qtr1'
    elif x <= 58000:
        return 'Qtr2'
    elif x <= 80140:
        return 'Qtr3'
    else:
        return 'Qtr4'
# Label each row with its annual-income quartile, then plot the
# charged-off share per bucket.
lending_club['annual_inc_Category'] = lending_club['annual_inc'].apply(splitAnnualIncome2Category)
categorical_Percentage_Distribution('annual_inc_Category', 'Distribution of annual_inc_Category column and ChargedOff Subscriber% on Total')
dti¶print(lending_club['dti'].describe(percentiles = np.arange(0.0,1.0,0.05)))
# Distribution of the debt-to-income ratio.
# NOTE(review): sns.distplot is deprecated in newer seaborn — consider histplot.
plt.figure(figsize=(15,7))
sns.distplot(lending_club['dti'])
plt.show()
# Split dti into 4 categories at its quartile boundaries:
# <=8 (25%), 8-13 (50%), 13-17 (75%), >17 (above 75%)
def splitDtiCategory(x):
    """Bucket a dti value into its quartile label ('Qtr1'..'Qtr4').

    Each elif already guarantees the lower bound, so only the upper bound is
    tested. NaN fails every comparison and falls into the final else, so it
    is labelled 'Qtr4' (same as the original behaviour).
    """
    if x <= 8:
        return 'Qtr1'
    elif x <= 13:
        return 'Qtr2'
    elif x <= 17:
        return 'Qtr3'
    else:
        return 'Qtr4'
# Label each row with its dti quartile, then plot the charged-off share
# per bucket.
lending_club['dti_Category'] = lending_club['dti'].apply(splitDtiCategory)
categorical_Percentage_Distribution('dti_Category', 'Distribution of dti_Category column and ChargedOff Subscriber% on Total')
Grade¶categorical_Percentage_Distribution('grade', 'Distribution of grade column and ChargedOff Subscriber% on Total')